Read in the data

This section will read in the data. And print a summary

Initial transformations for some of the data to make

The following are the mapping according to the data dictionary. season_mapping = {1:“winter”, 2:“spring”, 3:“summer”, 4:“fall”} yr_mapping = {0: “2011”, 1:“2012”} mnth_mapping = {1:“Jan”,2:“Feb”,3:“Mar”,4:“Apr”,5:“May”,6:“Jun”,7:“Jul”,8:“Aug”,9:“Sep”,10:“Oct”,11:“Nov”,12:“Dec”} weekday_mapping={0: ‘Sunday’, 1: ‘Monday’, 2: ‘Tuesday’,3: ‘Wednesday’, 4: ‘Thursday’, 5: ‘Friday’, 6: ‘Saturday’} weather_mapping={1:“Clear”,2:“Misty”,3:“Light Snow/Rain”,4:“Heavy Snow/Rain”}

library(tidyverse)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
bikesharing <- read_csv("./Data/hour.csv")
## Parsed with column specification:
## cols(
##   instant = col_double(),
##   dteday = col_date(format = ""),
##   season = col_double(),
##   yr = col_double(),
##   mnth = col_double(),
##   hr = col_double(),
##   holiday = col_double(),
##   weekday = col_double(),
##   workingday = col_double(),
##   weathersit = col_double(),
##   temp = col_double(),
##   atemp = col_double(),
##   hum = col_double(),
##   windspeed = col_double(),
##   casual = col_double(),
##   registered = col_double(),
##   cnt = col_double()
## )
# yr_mapping = {0: "2011", 1:"2012"}
# mnth_mapping = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
# weekday_mapping={0: 'Sunday', 1: 'Monday', 2: 'Tuesday',3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
# weather_mapping={1:"Clear",2:"Misty",3:"Light Snow/Rain",4:"Heavy Snow/Rain"}

bikesharing %<>% mutate(
                        season=case_when(
                          season==1 ~ "winter",
                          season==2 ~ "spring",
                          season==3 ~ "summer",
                          season==4 ~ "fall",
                          TRUE~"NA"  ), 
                        yr = case_when(
                          yr==0 ~"2011",
                          yr==1 ~ "2012"
                        ),
                        mnth = case_when(
                          mnth==1 ~"Jan",
                          mnth==2 ~"Feb",
                          mnth==3 ~"Mar",
                          mnth==4 ~"Apr",
                          mnth==5 ~"May",
                          mnth==6 ~"Jun",
                          mnth==7 ~"Jul",
                          mnth==8 ~"Aug",
                          mnth==9 ~"Sep",
                          mnth==10 ~"Oct",
                          mnth==11 ~"Nov",
                          mnth==12 ~"Dec",
                          ),
                        weekday= case_when(
                          weekday==0 ~ "Sunday",
                          weekday==1 ~ "Monday",
                          weekday==2 ~ "Tuesday",
                          weekday==3 ~ "Wednesday",
                          weekday==4 ~ "Thursday",
                          weekday==5 ~ "Friday",
                          weekday==6 ~ "Saturday"
                        ),
                        weathersit= case_when(
                          weathersit==1 ~"Clear",
                          weathersit==2 ~"Misty",
                          weathersit==3 ~"Light Snow or Rain",
                          weathersit==4 ~"Heavy Snow or Rain"
                        )
                        
                          )
bikesharing %>% slice_sample(n=200)
## # A tibble: 200 x 17
##    instant dteday     season yr    mnth     hr holiday weekday workingday
##      <dbl> <date>     <chr>  <chr> <chr> <dbl>   <dbl> <chr>        <dbl>
##  1    1300 2011-02-27 winter 2011  Feb       8       0 Sunday           0
##  2    7159 2011-10-30 fall   2011  Oct      21       0 Sunday           0
##  3    9105 2012-01-20 winter 2012  Jan       6       0 Friday           1
##  4    7883 2011-11-30 fall   2011  Nov       2       0 Wednes…          1
##  5   17314 2012-12-29 winter 2012  Dec       6       0 Saturd…          0
##  6   15788 2012-10-25 fall   2012  Oct       0       0 Thursd…          1
##  7    6277 2011-09-24 fall   2011  Sep       2       0 Saturd…          0
##  8    5507 2011-08-22 summer 2011  Aug       8       0 Monday           1
##  9   13767 2012-08-01 summer 2012  Aug      19       0 Wednes…          1
## 10    5437 2011-08-19 summer 2011  Aug      10       0 Friday           1
## # … with 190 more rows, and 8 more variables: weathersit <chr>, temp <dbl>,
## #   atemp <dbl>, hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>,
## #   cnt <dbl>

Exploratory Data Analysis and Visualizations

We will plot the distribution plot of the registered and casual riders all up

bikeshare_plot<-bikesharing %>% pivot_longer(cols = c(registered,casual),names_to="RideType",values_to="NumRides") %>% select(RideType,NumRides)
fig <- ggplot(data = bikeshare_plot) +
  geom_density(mapping = aes(x=NumRides,fill=RideType),alpha=0.25) +
  scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
  ggtitle("Distribution of Riders",subtitle = "Casual Vs Registered") +
  xlab("Number of Rides")+
  ylab("Density")+
  theme_classic() +
  theme(
    legend.position = c(.95, .95),
    legend.justification = c("right", "top"),
    legend.box.just = "right",
    legend.margin = margin(6, 6, 6, 6)
    ) +
  theme(legend.title = element_blank()) +
  theme(plot.title = element_text(hjust = .5)) +
  theme(plot.subtitle = element_text(hjust = .5))
  
plot(x = fig)

ggsave("./figs/rides_distributions.png",plot = fig,device = "png")
rm(bikeshare_plot)

Plot the time series. Evolution of Rides by time.

plot_data<- bikesharing %>% select(dteday,registered,casual) %>% 
  group_by(dteday) %>% summarise(registered=sum(registered),casual=sum(casual)) %>% 
  pivot_longer(cols=c(registered,casual),names_to="RideType",values_to="NumberOfRides")

fig<-ggplot(data = plot_data) + 
  geom_line(mapping = aes(x=dteday,y=NumberOfRides,color=RideType))+
  ggtitle("Number of Riders by day")+
  xlab("Date") +
  ylab("Number of Riders") +
  theme_classic()+
  theme(
    legend.position = c(1, 1),
    legend.justification = c("right", "top"),
    legend.box.just = "right",
    legend.margin = margin(0, 0, 0, 0),
    legend.title = element_blank()
  ) +
  theme(plot.title = element_text(hjust = 0.5))
fig

ggsave("./figs/rides_daily.png",plot = fig,device = "png")
rm(plot_data)

PLotting Time Series with Rolling means and standard deviations

windowsize <- 7
plot_data<- bikesharing %>% select(dteday,registered,casual) %>% 
  group_by(dteday) %>% 
  summarise(sumreg=sum(registered),sumcas=sum(casual)) %>% 
  mutate(regrollm=rollmean(x=sumreg,k=windowsize,align="right",fill=NA),
         casrollm=rollmean(x=sumcas,k=windowsize,align="right",fill=NA),
         regrollsd=rollapplyr(data= sumreg,width=windowsize,FUN="sd",fill=NA),
         casrollsd=rollapplyr(data= sumcas,width=windowsize,FUN="sd",fill=NA)
           
         ) %>% select(dteday,registered=regrollm,registeredsd=regrollsd,casual=casrollm,casualsd=casrollsd) %>% filter(!is.na(registered)) 
plot_data_1<- plot_data %>% select(dteday,registered,casual) %>% 
  pivot_longer(cols = c("registered","casual"),names_to="RideType",values_to="RollingMeanRides")
plot_data_2<- plot_data %>% select(dteday,registeredsd,casualsd) %>% 
  pivot_longer(cols = c("registeredsd","casualsd"),names_to="RideType",values_to="RollingsdRides")

fig <- ggplot(data = plot_data_1) + geom_line(mapping = aes(x=dteday,y=RollingMeanRides,color=RideType)) +
  geom_ribbon(data=plot_data_2,mapping = aes(x=dteday,ymin=plot_data_1$RollingMeanRides-2*RollingsdRides,ymax=plot_data_1$RollingMeanRides+2*RollingsdRides,fill=RideType),alpha=0.25,show.legend = FALSE) +
  ggtitle("Number of Readers by Day",subtitle = "Smoothed by Rolling Average of 7 days")+
  ylab("Number of Riders")+
  xlab("Date") +
  theme_classic()+
  theme(
    legend.position = c(1, 1),
    legend.justification = c("right", "top"),
    legend.box.just = "right",
    legend.margin = margin(0, 0, 0, 0),
    legend.title = element_blank()
  )+
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.58))  
  
fig

ggsave("./figs/rolling_rides_daily.png",plot = fig,device = "png")
rm(list=c("plot_data_1","plot_data_2"))

distribution of bike rides during a day and then during days of a week

We will draw both the total and the average of riders by hour of the day by season.

plot_data<-bikesharing %>% pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="NumberOfRides") %>% 
  select(hr,season,RideType,NumberOfRides) %>% mutate(season=factor(season,levels=c("winter","spring","summer","fall")),RideType=factor(RideType,levels=c("registered","casual")), hr=factor(hr))

fig<-ggplot(data = plot_data) + 
  geom_col(mapping = aes(x=hr,y=NumberOfRides),fill="lightblue")+ 
  scale_x_discrete(breaks=seq(0,23))+
  facet_grid(rows = vars(season),cols = vars(RideType),switch = "y") +
  ggtitle("Hourly Distribution of Rides by Season") +
  xlab("Hour of the Day")+
  ylab("Number of Rides")+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))
fig

ggsave("./figs/totalregisteredVsCasualridersbyhourbyseason.png",plot = fig,device = "png")
rm(plot_data)
plot_data_average <- 
  bikesharing %>% group_by(hr,season) %>% summarise(registered=mean(registered),casual=mean(casual)) %>% 
  pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="AverageRides") %>% 
  select(hr,season,RideType,AverageRides) %>% 
  mutate(season=factor(season,levels=c("winter","spring","summer","fall"))
         ,RideType=factor(RideType,levels=c("registered","casual"))
         , hr=factor(hr))

fig<-ggplot(data = plot_data_average) + 
  geom_col(mapping = aes(x=hr,y=AverageRides),fill="lightblue")+ 
  scale_x_discrete( breaks=seq(0,23))+
  facet_grid(rows = vars(season),cols = vars(RideType),switch = "y") +
  ggtitle("Hourly distribution of Average Number of Rides by Season")+
  ylab("Average Number of Rides")+
    xlab("Hour of the Day") +
  theme_classic()
  
fig

ggsave("./figs/AverageregisteredVsCasualridersbyhourbyseason.png",plot = fig,device = "png")
rm(plot_data_average)

We will check for distribution of rides by day of week by season

plot_data<-bikesharing %>% 
  pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="NumberOfRides") %>% 
  select(weekday,season,RideType,NumberOfRides) %>% 
  mutate(
        season=factor(season,levels=c("winter","spring","summer","fall"))
        ,RideType=factor(RideType,levels=c("registered","casual"))
        ,weekday=factor(weekday,levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
        )
  
fig<-ggplot(data = plot_data) +
  geom_col(mapping = aes(x=weekday,y=NumberOfRides),fill="lightblue") +
  facet_grid(rows = vars(season),cols = vars(RideType)) +
  scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
  scale_x_discrete(guide = guide_axis(n.dodge = 2))+
  ggtitle("Weekly Rides by Season")+
  xlab("Day of the Week")+
  ylab("Total Number of Rides")+
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))
plot(x=fig)

ggsave("./figs/NumRidesBySeason.png",device = "png",plot = fig)
rm(plot_data)

Now the plot of Average Number of Rides by Weekday and Season

plot_data <- bikesharing %>% group_by(weekday,season) %>% 
             summarise(registered=mean(registered),casual=mean(casual)) %>% 
            pivot_longer(cols = c(registered,casual),names_to="RideType",values_to="AverageRides") %>% 
  mutate(season=factor(season,levels=c("winter","spring","summer","fall"))
         ,RideType=factor(RideType,levels=c("registered","casual"))
         ,weekday=factor(weekday,levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
  )
            
fig <- ggplot(data = plot_data) +
  geom_col(mapping = aes(x=weekday,y=AverageRides),fill="lightblue") +
  facet_grid(rows = vars(season),cols = vars(RideType)) +
  scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
  ggtitle("Average Rides by Day of the Week and Season") +
  xlab("Day of the Week") +
  ylab("Average Number of Rides") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))
plot(fig)

ggsave("./figs/AverageRidesByDaysofWeekAndSeason.png",device = "png",plot = fig)
rm(plot_data)

hypothesis testing using T-Tests

We will be using statistical tests to determine whether 2 groups are different. We will be testing whether 1. There is a difference between Registered Riders on the weekend vs weekdays 2. There is a difference between Casual Riders on the weekend vs weekdays 3. Is there a difference between Registered riders between various weekdays 4. Is there a difference between Casual riders between various weekdays

T-Test for cases 1 and 2 above in a single command.

ttestData<-bikesharing %>% 
  select(weekday,registered,casual) %>% 
  mutate(IsWeekDay=case_when(
    weekday %in% c("Saturday","Sunday") ~ "No",
    TRUE~"Yes"
  )) %>% 
  select(IsWeekDay,casual,registered) %>% 
  pivot_longer(cols=c(registered,casual),names_to="RideType",values_to="NumRides")

testResults<-ttestData %>% 
  group_by(RideType,IsWeekDay) %>% 
  nest() %>% 
  pivot_wider(names_from = IsWeekDay,values_from=data) %>% 
  mutate(
    t_test = map2(No, Yes, ~{t.test(.x$NumRides, .y$NumRides) %>% tidy()})
    
  ) %>%
  unnest(c(t_test)) %>% 
  select(RideType,statistic,p.value)

testResults
## # A tibble: 2 x 3
## # Groups:   RideType [2]
##   RideType   statistic   p.value
##   <chr>          <dbl>     <dbl>
## 1 registered     -19.0 1.51e- 79
## 2 casual          30.4 1.70e-188
rm(ttestData,testResults)

Looking at the p values and at .05 Confidence Level we can safely reject the NULL Hypothesis that means are identical. The alternative hypothesis is that the means are different . We can conclude that based on the data there is a difference between the number of users across categories between weekdays and weekends.

I will get back with differences of users between weekdays. Given this is count data how do we fit other methods to do the test.

plot_data<-bikesharing %>%
  pivot_longer(cols=c("registered","casual"),names_to="RideType",values_to="NumRides") %>% 
  select(temp,atemp,hum,windspeed,RideType,NumRides)
  
  plotscatters<-function(colname,busname)
  {
    coltoplot<-ensym(colname)
      fig<-ggplot(data = plot_data) +
      geom_point(mapping = aes(x=!!coltoplot,y=NumRides,color=RideType),alpha=.25) +
      geom_smooth(mapping = aes(x=!!coltoplot,y=NumRides,color=RideType),method = "lm",alpha=.25) +
      ggtitle(paste("Plot of Rides to", busname)) +
      theme_classic()+
      theme(plot.title = element_text(hjust = 0.5))+
      theme(legend.title = element_blank())+
      ylab("Number of Rides") +
      xlab(busname)
      ggsave(paste("./figs/",busname,".png"),device = "png",plot = fig)
      return(fig)
    
  }
  
  varlis<-c("temp","atemp","hum","windspeed")
  buslis<-c("Percieved Temperature","Actual Temperature","Humidity","Wind Speed")
  figlist<-map2(varlis,buslis,plotscatters)
  for (figs in figlist) {
    plot(figs)
    
  }

  rm(plot_data)